This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
1. Downloaded the restaurant inspection dataset in CSV format from https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j, named the file NYRestaurantInspection2023.csv, and read that CSV file.
# Load the NYC restaurant inspection results from the downloaded CSV file.
nyc_rest_inspec <- read.csv(
  "/Users/vishwapatel/Downloads/NYRestaurantInspection2023.csv"
)
(1a) Formed a new data frame restricted to restaurants in Queens with cuisine equal to “Pizza”
# Keep only the inspection rows for Queens whose cuisine is listed as "Pizza".
nyc_queens_pizza <- nyc_rest_inspec %>%
  filter(BORO == "Queens", CUISINE.DESCRIPTION == "Pizza")
(1b) What are the 5 most frequently inspected restaurants (use the variable “DBA”) in the data frame?
# Top 5 pizza restaurants in Queens by number of rows per DBA.
# NOTE(review): this counts rows; if the data holds several rows per
# inspection, the count overstates the number of inspections -- TODO confirm.
nyc_queens_pizza_frequent <- nyc_queens_pizza %>%
  group_by(DBA) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 5)
nyc_queens_pizza_frequent
The 5 most frequently inspected pizza restaurants in Queens are shown above.
# Top 5 restaurants across the whole data set by number of rows per DBA.
# NOTE(review): this counts rows; if the data holds several rows per
# inspection, the count overstates the number of inspections -- TODO confirm.
nyc_frequent <- nyc_rest_inspec %>%
  group_by(DBA) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 5)
nyc_frequent
The 5 most frequently inspected restaurants overall are shown above.
(1c) On what dates has pizza parlor “SUSANO’S PIZZERIA & RESTAURANT” been inspected?
# Dates on which pizza parlor "SUSANO'S PIZZERIA & RESTAURANT" has been inspected.
# The data holds several rows for the same inspection date, so duplicates are
# dropped with unique() to list each inspection date exactly once.
inspec_susano <- nyc_rest_inspec %>%
  filter(DBA == "SUSANO'S PIZZERIA & RESTAURANT")
unique(inspec_susano$INSPECTION.DATE)
## [1] "01/12/2023" "05/05/2022"
#2
# Load the tab-separated 2007 gapminder extract that includes the gini column.
gapminder_2007_gini <- read_tsv(
  file = "/Users/vishwapatel/Downloads/gapminder_2007_gini.tsv"
)
## Rows: 108 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): country, continent
## dbl (5): year, lifeExp, pop, gdpPercap, gini
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#2A
# Box plot of the Gini coefficient per continent, one colour per continent.
b_plot <- ggplot(data = gapminder_2007_gini) +
  geom_boxplot(mapping = aes(x = continent, y = gini, colour = continent)) +
  ggtitle(
    "BOX PLOT\nComparing the distributions of the Gini coefficient in different continents"
  )
# Render the plot just built as an interactive plotly widget.
ggplotly(b_plot)
#2B
# Scatterplot of life expectancy against the Gini index, coloured by continent.
# Every layer, including ggtitle(), must be chained with `+`: a missing `+`
# makes ggtitle() a separate statement that only prints a labels object and
# leaves the plot without a title.
s_plot <- ggplot(data = gapminder_2007_gini) +
  geom_point(aes(gini, lifeExp, colour = continent)) +
  xlab("GINI index") +
  ylab("Life Expectancy in 2007") +
  ggtitle(
    "SCATTERPLOT\nshowing relationship between the Gini coefficient and life expectancy in 2007, classified by continents"
  )
# Pass the plot explicitly instead of relying on the implicit last-plot lookup.
ggplotly(s_plot)
The scatterplot, with points coloured by continent, shows no clear relationship between the Gini coefficient and life expectancy in 2007: the points are scattered across the plot with no clear pattern. It appears that other factors, such as economic development and healthcare infrastructure, may have a larger impact on life expectancy.
library(gapminder)
# Work on a copy of the gapminder data set in the current environment.
gapminder <- gapminder
# Add gdp: population (pop) multiplied by GDP per capita (gdpPercap).
gapminder_modified <- gapminder %>%
  mutate(gdp = pop * gdpPercap)
# Restrict to the United States in 2007.
gdp_us_2007 <- gapminder_modified %>%
  filter(year == 2007, country == "United States")
# GDP of the United States in 2007, taken from the gdp column computed above.
us_gdp_2007 <- gdp_us_2007$gdp
us_gdp_2007
## [1] 1.293446e+13
# New variable gdp_ratio: each row's gdp divided by the gdp of the
# United States in 2007.
gapminder_modified_final <- gapminder_modified %>%
  mutate(gdp_ratio = gdp / us_gdp_2007)
# Median gdp_ratio for every continent/year combination.
# `.groups = "drop"` returns an ungrouped result, so no leftover grouping by
# continent carries into later steps and summarise() does not emit its
# "has grouped output by" message.
gdp_median_ratio <- gapminder_modified_final %>%
  group_by(continent, year) %>%
  summarise(median = median(gdp_ratio), .groups = "drop")
gdp_median_ratio
# Plot the median gdp_ratio over time with both lines and points,
# using one colour (and one line) per continent.
gdp_ratio_plot <- ggplot(
  gdp_median_ratio,
  aes(x = year, y = median, colour = continent, group = continent)
) +
  geom_line() +
  geom_point()
# Render the plot just built as an interactive plotly widget.
ggplotly(gdp_ratio_plot)